# Exploratory Data Analysis ----
# Interactive trends plot: start from an empty scatter/line figure.
p <- plot_ly(type = "scatter", mode = "lines")
# Tidy the raw Google Trends export: "<1" means trace-level interest (treat
# as 0), actor columns become numeric, then reshape to long format with a
# proper Date in Month.
trends <- trends %>%
  mutate(across(everything(), ~ ifelse(. == "<1", 0, .))) %>%
  mutate(across("Mark Ruffalo":"Aaron Eckhart", as.numeric)) %>%
  pivot_longer(cols = "Mark Ruffalo":"Aaron Eckhart") %>%
  mutate(Month = ym(Month))
# Add one hidden ("legendonly") trace per actor to the plot.
# Refactored from purrr::walk() + `<<-` (global assignment side effect) to a
# purrr::reduce() fold that threads the plot through as an accumulator.
p <- trends %>%
  group_by(name) %>%
  group_split() %>%
  purrr::reduce(
    function(plot_acc, df) {
      add_trace(plot_acc,
                data = df,
                x = ~Month,
                y = ~value,
                name = unique(df$name),
                text = ~paste(name, value),
                hoverinfo = "text",
                visible = "legendonly")
    },
    .init = p
  )
# Final chart layout: titles, fixed axis ranges, and a legend prompt.
x_axis <- list(title = "Month",
               range = c("2004-01-01", "2025-03-01"))
y_axis <- list(title = "Google Trend Search Interest",
               range = c(0, 100))
p %>%
  layout(
    title = "Monthly Search Interest for Selected Actors since 2004",
    xaxis = x_axis,
    yaxis = y_axis,
    legend = list(title = list(text = "Select an Actor"))
  )
# Join the long trends data onto the actor metadata table; `actors` keys
# actor names in lowercase, so normalise case first.
joined <- trends %>%
  mutate(name = tolower(name)) %>%
  left_join(actors, by = "name")
# Mean monthly search interest faceted by nominee type.
# `.groups = "drop"` made explicit so summarise() no longer emits the
# "has grouped output" message and the result is fully ungrouped.
joined %>%
  filter(!is.na(type)) %>%
  group_by(Month, type) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  ggplot(aes(x = Month, y = value)) +
  geom_line() +
  labs(title = "Google Trendlines for all Oscar Nominated Actors",
       subtitle = "Nominated 2004-Present") +
  facet_wrap(vars(type))
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.

# This is pretty interesting - it suggests that, generally, Oscar-nominated actors have been searched more, but the gap has leveled off (which could reflect trends in Google Search overall). The cyclical pattern is also interesting and I'm guessing it coincides with the ceremony and nomination announcements!
# Top 3 calendar months by mean interest within each nominee type.
# `.groups = "drop_last"` keeps the result grouped by `type` (what the
# original relied on implicitly) so slice_max() works per type; the old
# no-op `ungroup(month)` call is removed.
joined %>%
  mutate(month = month(Month)) %>%
  filter(!is.na(type)) %>%
  group_by(type, month) %>%
  summarise(mean_value = mean(value), .groups = "drop_last") %>%
  slice_max(order_by = mean_value, n = 3)
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
## # A tibble: 9 × 3
## # Groups: type [3]
## type month mean_value
## <chr> <dbl> <dbl>
## 1 both 1 14.5
## 2 both 2 13.4
## 3 both 12 13.1
## 4 commercial 1 13.1
## 5 commercial 7 13.1
## 6 commercial 12 12.7
## 7 oscar 1 12.0
## 8 oscar 2 11.6
## 9 oscar 3 11.0
# Reshape to one row per actor, one column per month, for clustering.
wide <- trends %>%
pivot_wider(names_from = Month, values_from = value) %>%
tibble::column_to_rownames("name")
ts_matrix <- as.matrix(wide)
# Partitional time-series clustering (dtwclust::tsclust): shape-based
# distance (SBD) pairs with the "shape" centroid; seed fixed so cluster
# assignments are reproducible across knits.
clusters <- tsclust(ts_matrix,
type = "partitional",
k = 5, # tune this
distance = "sbd",
centroid = "shape", # required with SBD
control = partitional_control(iter.max = 50),
seed = 42)
cluster_assignments <- clusters@cluster # one cluster label per series
distances <- clusters@cldist # distance of each series to its centroid
# One row per actor: name, assigned cluster, and centroid distance.
cluster_df <- data.frame(name = rownames(ts_matrix),
                         cluster = cluster_assignments,
                         distance = distances)
# Attach the cluster info to every (actor, month) row of the long data.
actor_with_cluster <- left_join(trends, cluster_df, by = "name")
# Mean monthly interest per cluster, one facet per cluster.
# `.groups = "drop"` made explicit to silence the summarise() grouping
# message and hand ggplot an ungrouped data frame.
actor_with_cluster %>%
  group_by(Month, cluster) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  ggplot(aes(x = Month, y = value, colour = factor(cluster))) +
  geom_line(show.legend = FALSE) +
  scale_y_continuous(labels = scales::comma) +
  facet_wrap(vars(cluster)) + # Separate plots for each cluster
  scale_color_solarized() +
  theme_minimal() +
  labs(title = "Time Series Clustering", colour = "Cluster")
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.

# Stack the centroid series into a K x T matrix (one row per cluster).
centroid_matrix <- do.call(rbind, clusters@centroids)
# Flatten to long form for ggplot.
# BUG FIX: as.vector() on a matrix reads column-major, but Time/Cluster were
# generated row-major (Time cycling fastest), which scrambled the centroid
# traces. Transposing first makes the value order match the labels.
centroid_df <- data.frame(
  Time = rep(seq_len(ncol(centroid_matrix)), times = nrow(centroid_matrix)),
  Cluster = rep(seq_len(nrow(centroid_matrix)), each = ncol(centroid_matrix)),
  Value = as.vector(t(centroid_matrix))
)
ggplot(centroid_df, aes(x = Time, y = Value, color = factor(Cluster))) +
  # `linewidth` replaces the `size` aesthetic deprecated in ggplot2 3.4.0.
  geom_line(linewidth = 1) +
  facet_wrap(~ Cluster, ncol = 1) +
  labs(title = "Cluster Centroids", x = "Time", y = "Value", color = "Cluster") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Most representative actors: the 5 series closest to their cluster centroid
# (smallest SBD distance) within each of the 5 clusters.
actor_with_cluster %>% distinct(name, cluster, distance) %>%
group_by(cluster) %>%
slice_min(order_by = distance, n = 5)
## # A tibble: 25 × 3
## # Groups: cluster [5]
## name cluster distance
## <chr> <int> <dbl>
## 1 Chadwick Boseman 1 0.0156
## 2 Philip Seymour Hoffman 1 0.0166
## 3 Shi Pengyuan 1 0.0182
## 4 William T. Hurtz 1 0.0182
## 5 Carrie Fisher 1 0.0198
## 6 Joshua David Neal 2 0.0488
## 7 Bérénice Bejo 2 0.0602
## 8 Calah Lane 2 0.0736
## 9 Mickey Rourke 2 0.0799
## 10 Rooney Mara 2 0.0817
## # ℹ 15 more rows
# Share of actors in each cluster who are Oscar nominees.
# Rewritten to avoid the deprecated multi-row summarise() (dplyr >= 1.1
# warned and suggested reframe()): count() + a grouped mutate computes the
# same per-cluster proportion in one well-defined row per group.
actor_with_cluster %>%
  distinct(name, cluster) %>%
  mutate(name = tolower(name)) %>%
  left_join(actors, by = "name") %>% # explicit key silences the join message
  filter(!is.na(cluster)) %>%
  count(cluster, nominee) %>%
  group_by(cluster) %>%
  mutate(prop_nominees = n / sum(n)) %>%
  ungroup() %>%
  filter(nominee == 1) %>%
  select(cluster, nominee, prop_nominees)
## Joining with `by = join_by(name)`
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'cluster', 'nominee'. You can override
## using the `.groups` argument.
## # A tibble: 5 × 3
## # Groups: cluster, nominee [5]
## cluster nominee prop_nominees
## <int> <dbl> <dbl>
## 1 1 1 0.414
## 2 2 1 0.504
## 3 3 1 0.169
## 4 4 1 0.305
## 5 5 1 0.357
# Award seasons covered by the modelling panel.
years_seq <- 2004:2024
# Reduce cluster_df to one row per actor, lowercased to match `actors`.
cluster_df <- cluster_df %>%
  distinct(name, cluster) %>%
  mutate(name = tolower(name))
# Static per-actor features used by the models below.
features <- actors %>%
  left_join(cluster_df, by = "name") %>%
  select(name, age, gender, american, cluster)
# Explode the bracketed win-year lists (e.g. "[2004, 2010]") into one row
# per (actor, winning year).
winners <- actors %>%
  mutate(year_winner = str_remove_all(year_winner, "\\[|\\]")) %>%
  separate_rows(year_winner, sep = ",\\s*") %>%
  mutate(year_winner = as.integer(year_winner)) %>%
  select(name, winner, year_winner)
# Build a complete (actor x year) panel and flag prior wins.
# Consolidated from three separate group_by()/ungroup() cycles into a single
# grouped mutate: `won_previously` is 1 from the year AFTER an actor's first
# win onward (lag + cummax), 0 before. Rows are ordered by year within each
# name because expand() generates them sorted.
winners_filled <- winners %>%
  # Create a data frame of all names and years
  distinct(name) %>%
  expand(name, year_winner = years_seq) %>%
  left_join(winners, by = c("name", "year_winner")) %>%
  rename(year = year_winner) %>%
  group_by(name) %>%
  mutate(
    won = ifelse(is.na(winner), "no", winner),
    won_previously = cummax(ifelse(lag(won != "no", default = FALSE), 1, 0))
  ) %>%
  ungroup() %>%
  select(-won)
# Explode the bracketed nomination-year lists into one row per
# (actor, nominated year), flagging each as a nomination.
nominees <- actors %>%
  mutate(nominated_years = str_remove_all(nominated_years, "\\[|\\]")) %>%
  separate_rows(nominated_years, sep = ",\\s*") %>%
  mutate(nominated_years = as.integer(nominated_years)) %>%
  select(name, nominated_years) %>%
  mutate(nominee = ifelse(!is.na(nominated_years), "yes", "no"))
# Build a complete (actor x year) panel and flag prior nominations.
# Consolidated to a single grouped mutate (mirrors winners_filled above):
# `nominated_previously` is 1 from the year AFTER the first nomination
# onward (lag + cummax), 0 before.
nominees_filled <- nominees %>%
  # Create a data frame of all names and years
  distinct(name) %>%
  expand(name, nominated_years = years_seq) %>%
  left_join(nominees, by = c("name", "nominated_years")) %>%
  rename(year = nominated_years) %>%
  group_by(name) %>%
  mutate(
    nominated = ifelse(is.na(nominee), "no", nominee), # no NAs in the flag
    nominated_previously =
      cummax(as.integer(lag(nominated == "yes", default = FALSE)))
  ) %>%
  ungroup() %>%
  select(-nominated)
# Attach static actor features to both panels.
# NOTE(review): `age` in `actors` appears to hold a birth year, so
# `year - age` converts it to the actor's age in that season — TODO confirm
# against the `actors` source.
winners_filled <- winners_filled %>%
left_join(features, by = "name") %>%
mutate(age = year - age)
# Carry only the prior-win flag over to the nominees panel.
prev_win <- winners_filled %>%
select(name, year, won_previously)
nominees_filled <- nominees_filled %>%
left_join(features, by = "name") %>%
left_join(prev_win, by = c("name", "year")) %>%
mutate(age = year - age)
# Step 1: build one monthly ts per (actor, year) plus its variance.
# Sorting by Month before grouping guarantees the ts values are in
# chronological order within every group.
trend_ts_data_all <- trends %>%
  mutate(year = year(Month),
         name = tolower(name)) %>%
  arrange(Month) %>%
  group_by(name, year) %>%
  summarise(
    ts_data = list(ts(value, frequency = 12)),
    var = var(value, na.rm = TRUE),
    .groups = "drop"
  )
# Step 2: record the constant series (variance zero or undefined).
zero_variance_log <- trend_ts_data_all %>%
  filter(is.na(var) | var == 0)
# Step 3: keep only non-constant series from complete years.
trend_ts_data <- trend_ts_data_all %>%
  filter(!is.na(var), var > 0, year != 2025) %>%
  select(-var)
# Feature extraction + custom max spike height
# Computes the full tsfeatures() feature set per series, plus the series
# maximum as a custom "spike height" feature, then widens into columns.
# NOTE(review): the knit output below shows tsfeatures() still hitting
# ar.burg "zero-variance series" errors for some near-constant series —
# the var > 0 filter above evidently does not catch every degenerate case;
# consider wrapping with purrr::possibly() — TODO confirm.
trend_features_ts <- trend_ts_data %>%
mutate(
features = map(ts_data, ~ tsfeatures(.x)), # Compute all available features
max_spike_height = map_dbl(ts_data, ~ max(.x, na.rm = TRUE))
) %>%
unnest(features)
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in if (order) coefs[order, 1L:order] else numeric() :
## argument is not interpretable as logical
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Error in ar.burg.default(x, aic = aic, order.max = order.max, na.action = na.action, :
## zero-variance series
## Warning: There were 12112 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `features = map(ts_data, ~tsfeatures(.x))`.
## Caused by warning in `.f()`:
## ! Insufficient data to compute STL decomposition
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 12111 remaining warnings.
# --- 4. Merge with Oscar outcomes ---
# Label each (actor, year) feature row: 1 if nominated that year, else 0.
model_data <- trend_features_ts %>%
left_join(nominees_filled, by = c("name", "year")) %>%
mutate(
nominee = factor(ifelse(is.na(nominee), 0, 1)) # factor for classification
) %>%
# Dropped columns appear constant/uninformative for these monthly series
# (frequency, nperiods, seasonal_period) or degenerate — TODO confirm.
select(-frequency, -nperiods, -seasonal_period, -diff2_acf10, -seas_acf1)
# --- 5. Train/test split ---
# Stratify on the (imbalanced) outcome so both splits keep the class mix.
set.seed(123)
data_split <- initial_split(model_data, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)
# Drop identifiers and the raw series; keep only model features + outcome.
train_data <- train_data %>% select(-name, -ts_data, -year)
train_data_clean <- na.omit(train_data)
## Undersampling: balance the classes by downsampling the majority class.
# Cleaned up from the original, which set the seed three times, built
# balanced_data twice (the second bind_rows discarded the shuffled version),
# and repeated the factor conversions column by column.
# Split the training data into the two classes.
non_nominee <- train_data_clean %>% filter(nominee == 0)
nominee <- train_data_clean %>% filter(nominee == 1)
# Guard: undersampling only makes sense if non-nominees are the majority.
if (nrow(non_nominee) < nrow(nominee)) {
  stop("Not enough non-nominee rows to sample from.")
}
# Reproducible sample of the majority class, sized to the minority class.
set.seed(42)
non_nominee_undersampled <- non_nominee[sample(nrow(non_nominee), nrow(nominee)), ]
# Combine the undersampled non-nominees with the full nominee class.
balanced_data <- bind_rows(non_nominee_undersampled, nominee)
# Check the new balance of classes.
table(balanced_data$nominee)
# Encode the categorical predictors as factors for randomForest.
balanced_data <- balanced_data %>%
  mutate(across(c(cluster, gender, american, won_previously,
                  nominated_previously),
                as.factor))
# Two model inputs: all predictors vs. time-series features + cluster only.
balanced_data_1 <- balanced_data
balanced_data_2 <- balanced_data %>%
  select(-age, -gender, -american, -nominated_previously, -won_previously)
set.seed(1234)
# modeling all predictors
# Fresh stratified 80/20 split of the balanced data, then a random forest
# with default settings (500 trees; mtry chosen automatically).
data_split <- initial_split(balanced_data, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)
rf_model <- randomForest(nominee ~ ., data = train_data)
# Print model summary (OOB error estimate and confusion matrix)
print(rf_model)
##
## Call:
## randomForest(formula = nominee ~ ., data = train_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 28.78%
## Confusion matrix:
## 0 1 class.error
## 0 161 77 0.3235294
## 1 60 178 0.2521008
# Held-out evaluation of the all-predictors model.
predictions <- predict(rf_model, newdata = test_data)
# Confusion Matrix
conf_matrix <- table(Predicted = predictions, Actual = test_data$nominee)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 45 20
## 1 15 40
# Accuracy: proportion of test rows classified correctly
accuracy <- sum(predictions == test_data$nominee) / nrow(test_data)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy: 0.708333333333333"
# caret adds precision, recall, kappa, etc. Note it treats the first factor
# level ("0" = non-nominee) as the positive class (see output below).
conf_matrix_caret <- confusionMatrix(predictions, test_data$nominee)
# Print evaluation metrics
print(conf_matrix_caret)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 45 20
## 1 15 40
##
## Accuracy : 0.7083
## 95% CI : (0.6184, 0.7877)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 2.866e-06
##
## Kappa : 0.4167
##
## Mcnemar's Test P-Value : 0.499
##
## Sensitivity : 0.7500
## Specificity : 0.6667
## Pos Pred Value : 0.6923
## Neg Pred Value : 0.7273
## Prevalence : 0.5000
## Detection Rate : 0.3750
## Detection Prevalence : 0.5417
## Balanced Accuracy : 0.7083
##
## 'Positive' Class : 0
##
## MeanDecreaseGini
## trend 14.105240
## spike 14.384914
## linearity 46.933497
## curvature 20.636132
## e_acf1 12.100508
## e_acf10 11.590256
## entropy 9.266528
## x_acf1 13.594978
## x_acf10 11.704599
## diff1_acf1 13.922044
## diff1_acf10 11.871913
## diff2_acf1 15.846169
## max_spike_height 11.997186
## nominated_previously 3.255137
## age 14.057697
## gender 1.465645
## american 1.788679
## cluster 7.852584
## won_previously 1.127954

# modeling just time series predictors
# Same pipeline restricted to balanced_data_2 (ts features + cluster only).
# NOTE(review): no set.seed() before this split — it depends on the RNG
# state left by the previous model; add a seed for exact reproducibility.
data_split <- initial_split(balanced_data_2, prop = 0.8, strata = nominee)
train_data <- training(data_split)
test_data <- testing(data_split)
rf_model <- randomForest(nominee ~ ., data = train_data)
# Print model summary (OOB error estimate and confusion matrix)
print(rf_model)
##
## Call:
## randomForest(formula = nominee ~ ., data = train_data)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 30.46%
## Confusion matrix:
## 0 1 class.error
## 0 161 77 0.3235294
## 1 68 170 0.2857143
# Held-out evaluation of the time-series-only model (mirrors the earlier
# evaluation of the all-predictors model for comparison).
predictions <- predict(rf_model, newdata = test_data)
# Confusion Matrix
conf_matrix <- table(Predicted = predictions, Actual = test_data$nominee)
print(conf_matrix)
## Actual
## Predicted 0 1
## 0 36 17
## 1 24 43
# Accuracy: proportion of test rows classified correctly
accuracy <- sum(predictions == test_data$nominee) / nrow(test_data)
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy: 0.658333333333333"
# caret adds precision, recall, kappa, etc.; positive class is "0" here too.
conf_matrix_caret <- confusionMatrix(predictions, test_data$nominee)
# Print evaluation metrics
print(conf_matrix_caret)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 36 17
## 1 24 43
##
## Accuracy : 0.6583
## 95% CI : (0.5662, 0.7424)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 0.0003334
##
## Kappa : 0.3167
##
## Mcnemar's Test P-Value : 0.3487367
##
## Sensitivity : 0.6000
## Specificity : 0.7167
## Pos Pred Value : 0.6792
## Neg Pred Value : 0.6418
## Prevalence : 0.5000
## Detection Rate : 0.3000
## Detection Prevalence : 0.4417
## Balanced Accuracy : 0.6583
##
## 'Positive' Class : 0
##
## MeanDecreaseGini
## trend 15.81031
## spike 16.26184
## linearity 50.35193
## curvature 21.73266
## e_acf1 13.97131
## e_acf10 11.87024
## entropy 10.72542
## x_acf1 14.57469
## x_acf10 12.76247
## diff1_acf1 15.03577
## diff1_acf10 13.60832
## diff2_acf1 16.75506
## max_spike_height 13.76074
## cluster 10.26029
